# Point the session at the folder that holds the CSV exports.
# The path must be wrapped in plain ASCII quotes: the typographic quotes
# used previously are not valid R string delimiters and fail to parse.
setwd("/Users/ajdonna/Desktop/CoursesNpractice/FaceBook Data")

# Yearly aggregates, read from Yearly_FB_data.csv.
FB_Year <- read.csv("Yearly_FB_data.csv")

# Per-column summary statistics of the yearly table.
summary(object = FB_Year)
##       YEAR        Mean_Likes     Median_Likes    Mean_Comments  
##  Min.   :2011   Min.   : 4.00   Min.   : 4.000   Min.   :0.000  
##  1st Qu.:2012   1st Qu.: 7.00   1st Qu.: 4.000   1st Qu.:1.000  
##  Median :2014   Median :13.00   Median : 8.000   Median :2.000  
##  Mean   :2014   Mean   :16.57   Mean   : 9.571   Mean   :2.429  
##  3rd Qu.:2016   3rd Qu.:27.50   3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :2017   Max.   :30.00   Max.   :19.000   Max.   :5.000  
##  Median_Comments      Likes         Comments         Posts      
##  Min.   :0.0000   Min.   :  62   Min.   : 12.0   Min.   : 14.0  
##  1st Qu.:0.0000   1st Qu.:1002   1st Qu.:140.0   1st Qu.: 72.5  
##  Median :0.0000   Median :2856   Median :186.0   Median :125.0  
##  Mean   :0.8571   Mean   :2314   Mean   :316.6   Mean   :151.1  
##  3rd Qu.:1.5000   3rd Qu.:3373   3rd Qu.:463.0   3rd Qu.:183.0  
##  Max.   :3.0000   Max.   :4534   Max.   :812.0   Max.   :408.0  
##    TYPE_VIDEO       TYPE_LINK       TYPE_PHOTO     TYPE_STATUS    
##  Min.   :  0.00   Min.   : 0.00   Min.   : 5.00   Min.   :  9.00  
##  1st Qu.:  0.50   1st Qu.:11.00   1st Qu.:33.50   1st Qu.: 11.00  
##  Median :  1.00   Median :23.00   Median :49.00   Median : 18.00  
##  Mean   : 53.57   Mean   :19.14   Mean   :42.57   Mean   : 35.71  
##  3rd Qu.: 26.50   3rd Qu.:27.00   3rd Qu.:56.50   3rd Qu.: 38.50  
##  Max.   :320.00   Max.   :35.00   Max.   :64.00   Max.   :124.00
# Compact overview of the yearly table: dimensions, column types, first values.
str(object = FB_Year)
## 'data.frame':    7 obs. of  12 variables:
##  $ YEAR           : int  2011 2012 2013 2014 2015 2016 2017
##  $ Mean_Likes     : int  4 5 13 25 30 30 9
##  $ Median_Likes   : int  4 4 8 19 19 9 4
##  $ Mean_Comments  : int  1 1 4 5 4 2 0
##  $ Median_Comments: int  0 0 1 3 2 0 0
##  $ Likes          : int  62 506 2856 3162 1497 4534 3584
##  $ Comments       : int  12 132 812 661 186 265 148
##  $ Posts          : int  14 95 217 125 50 149 408
##  $ TYPE_VIDEO     : int  0 0 1 1 6 47 320
##  $ TYPE_LINK      : int  0 16 28 23 6 35 26
##  $ TYPE_PHOTO     : int  5 47 64 55 20 58 49
##  $ TYPE_STATUS    : int  9 32 124 45 18 9 13
# Categorical copy of YEAR, stored alongside the integer column.
FB_Year[["f_YEAR"]] <- factor(FB_Year[["YEAR"]])

Analyzing the #Posts with Years

library(ggplot2)

# Posts per year (default line colour) with total Comments per year in blue.
ggplot(FB_Year, aes(x = YEAR, y = Posts)) +
  geom_line() +
  geom_line(aes(y = Comments), colour = "blue") +
  scale_x_continuous(breaks = 2011:2017) +
  scale_y_continuous(breaks = seq(0, 800, by = 100))

Analyzing the Same data for Mean Likes and Comments

# Mean likes per year (default line colour) with mean comments per year in red.
ggplot(FB_Year, aes(x = YEAR, y = Mean_Likes)) +
  geom_line() +
  geom_line(aes(y = Mean_Comments), colour = "red") +
  scale_x_continuous(breaks = 2011:2017) +
  scale_y_continuous(breaks = seq(0, 30, by = 5)) +
  ylab("Mean_Likes/Mean_Comments")

Analyzing the Comments data as per the Year

# Yearly comment totals on a log2 scale (default colour), overlaid with the
# mean (red) and median (blue) comments per year.
# NOTE(review): only the totals are log2-transformed; the mean/median lines
# are plotted on their raw values -- confirm this mix is intended.
ggplot(FB_Year, aes(x = YEAR, y = log2(Comments))) +
  geom_line() +
  geom_line(aes(y = Mean_Comments), colour = "red") +
  geom_line(aes(y = Median_Comments), colour = "blue") +
  scale_x_continuous(breaks = 2011:2017)

Loading the Initial Dataset

# Post-level data, read from FBposts_Words.csv.
FB <- read.csv(file = "FBposts_Words.csv")

# Compact overview of the post-level table: dimensions, column types, first values.
str(object = FB)
## 'data.frame':    1075 obs. of  21 variables:
##  $ ID      : Factor w/ 1073 levels "1405174859572227_1000278100061907",..: 775 775 572 586 585 583 580 579 578 576 ...
##  $ DAY     : int  31 31 30 29 24 8 25 25 25 19 ...
##  $ MONTH   : int  12 12 12 12 12 12 10 10 10 10 ...
##  $ YEAR    : int  2010 2010 2011 2011 2011 2011 2011 2011 2011 2011 ...
##  $ DATE    : Factor w/ 577 levels "1/1/13","1/1/15",..: 171 171 169 164 158 179 85 85 85 74 ...
##  $ HOUR    : int  20 20 5 3 9 7 8 8 8 9 ...
##  $ MIN     : int  0 0 29 54 33 59 40 36 7 39 ...
##  $ SEC     : int  0 0 49 29 4 4 52 27 44 43 ...
##  $ TIME    : Factor w/ 1060 levels "00:00:59","00:02:16",..: 925 925 166 88 322 279 305 303 285 324 ...
##  $ TYPE    : Factor w/ 5 levels "link","note",..: 1 1 3 3 4 3 4 3 4 4 ...
##  $ LIKES   : int  0 0 5 1 5 0 0 3 6 3 ...
##  $ COMMENTS: int  0 0 0 0 1 0 0 0 2 3 ...
##  $ POS     : num  0 0 0.667 0 0.045 0 0 0 0.355 0.213 ...
##  $ NEG     : num  0 0 0 0 0.138 0 0 0 0.041 0 ...
##  $ NEU     : num  0 0 0.333 0 0.817 0 0 0 0.604 0.787 ...
##  $ COMP    : num  0 0 0.612 0 -0.964 0 0 0 0.96 0.852 ...
##  $ WORDS   : int  0 0 0 0 94 0 0 0 22 24 ...
##  $ X       : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ X.1     : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ X.2     : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ X.3     : int  NA 0 NA NA NA NA NA NA NA NA ...
# Categorical copies of YEAR and MONTH for use as discrete plot aesthetics.
FB[["fYear"]] <- factor(FB[["YEAR"]])
FB[["fMonth"]] <- factor(FB[["MONTH"]])

# Summary statistics of comments per post.
summary(object = FB$COMMENTS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   2.085   2.000  70.000
# Summary statistics of likes per post.
summary(object = FB$LIKES)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.00    6.00   15.37   15.00  233.00

Analyzing relationship between LIKES and COMMENTS over the course of Years

# LIKES against COMMENTS for every post outside 2010, coloured by year.
# Axes are limited to 0-20 comments and 0-125 likes; points beyond those
# limits are dropped by the scales.
ggplot(subset(FB, subset = YEAR != 2010), aes(x = COMMENTS, y = LIKES)) +
  geom_point(mapping = aes(colour = fYear)) +
  scale_x_continuous(limits = c(0, 20), breaks = seq(0, 20, by = 2)) +
  ylim(0, 125)
## Warning: Removed 21 rows containing missing values (geom_point).

Notes: The graph shows that posts from the most recent years garner the most likes, whereas the posts that received a high number of comments come from the earlier years!

Putting the factor of NUMBER OF WORDS used in the post

# Same scatter as before, with point size mapped to the post's word count.
ggplot(subset(FB, subset = YEAR != 2010), aes(x = COMMENTS, y = LIKES)) +
  geom_point(mapping = aes(colour = fYear, size = WORDS)) +
  scale_x_continuous(limits = c(0, 20), breaks = seq(0, 20, by = 2)) +
  ylim(0, 125)
## Warning: Removed 21 rows containing missing values (geom_point).

Divide the Length of the Post in 5 Categories

# Bucket post length (WORDS) into five bands:
#   Vlow = 0-25, Low = 26-50, Med = 51-75, High = 76-100, Vhigh = 101+.
# The top break is Inf rather than the current maximum (226), so longer posts
# in a refreshed export still land in "Vhigh" instead of becoming NA.
# cut() already returns a factor, so no extra factor() wrapper is needed.
FB$Words_Level <- cut(
  FB$WORDS,
  breaks = c(-1, 25, 50, 75, 100, Inf),
  labels = c("Vlow", "Low", "Med", "High", "Vhigh")
)

Trying to get a clear picture of the relationship between WORDS and LIKES

# LIKES against COMMENTS, coloured by word-count band, one panel per year
# (each panel has its own axis ranges).
ggplot(subset(FB, subset = YEAR != 2010), aes(x = COMMENTS, y = LIKES)) +
  geom_point(mapping = aes(colour = Words_Level)) +
  facet_wrap(facets = ~YEAR, scales = "free")

Notes: It appears that, beginning in 2013, posts with VERY LOW (up to 25) or LOW (26–50) word counts have been getting more LIKES than COMMENTS, with a few outliers here and there in the form of posts with a larger number of WORDS!

Distribution of LIKES, COMMENTS with the week of the month

# Map day-of-month to week-of-month using equal 7-day weeks:
#   week 1 = days 1-7, 2 = 8-14, 3 = 15-21, 4 = 22-28, 5 = 29-31.
# (Breaks of 8/15/22 made week 1 eight days long and week 4 only six,
# which skews any per-week comparison of post counts.)
FB$Week <- cut(FB$DAY, breaks = c(0, 7, 14, 21, 28, 31), labels = 1:5)

# Number of posts per week-of-month (2010 excluded).
# geom_bar() counts rows per discrete x value by default, so it draws the same
# bars as geom_histogram(stat = "count") without the "Ignoring unknown
# parameters: binwidth, bins, pad" warning.
ggplot(subset(FB, YEAR != 2010), aes(x = Week)) +
  geom_bar(colour = "black", fill = "#678912")

Notes: From this plot, it is evident that the number of posts is distributed similarly across the weeks of a month, except for week 5! Let’s see if the distribution is the same across all the years.

# Posts per week-of-month, one panel per year (each panel has its own scales).
# geom_bar() is the counting bar chart; geom_histogram(stat = "count") produced
# the same bars but warned about ignored binning parameters.
ggplot(subset(FB, YEAR != 2010), aes(x = Week)) +
  geom_bar(colour = "black", fill = "#678912") +
  facet_wrap(~YEAR, scales = "free")

Notes: From 2012 through 2016, the maximum number of posts was shared during the first week of the month, with approximately the same number of posts shared across the other weeks except the last (5th). But the scale of the number of posts differs considerably from one year to another. Year 2013’s scale is almost double that of 2012, 2014 and 2016, and in 2017 the scale has already jumped four-fold!

# Posts per week-of-month, stacked by post TYPE, one panel per year.
# Posts from 2010 and posts of type "note" are excluded.
# geom_bar() replaces geom_histogram(stat = "count"), which drew the same bars
# but warned about ignored binning parameters.
ggplot(subset(FB, YEAR != 2010 & TYPE != "note"), aes(x = Week)) +
  geom_bar(aes(fill = TYPE), colour = "black") +
  facet_wrap(~YEAR, scales = "free")

Notes: For Year 2012-> Photo, 2013-> Status, 2014-> Status & Photo, 2015 -> Status&Photo, 2017 -> Video. It is clear that it was in 2015 that considerable number of posts were shared across all the 4 categories. Prior to that it was either a clash between Photo and status in terms of more number of posts or either one of them dominated.

Distribution of WORDS across years

# Summary statistics of words per post.
summary(object = FB$WORDS)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     0.0    16.8    23.0   226.0
# Distribution of post length (WORDS) per year, 2010 excluded.
# The view is zoomed to the 98th percentile of WORDS with coord_cartesian()
# instead of ylim(): ylim() discards rows above the cut-off *before* the box
# statistics are computed (hence the old "Removed 22 rows" warning), which
# biases the quartiles and whiskers. coord_cartesian() only crops the view.
ggplot(subset(FB, YEAR != 2010), aes(x = fYear, y = WORDS)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, quantile(FB$WORDS, 0.98, names = FALSE)))

Summarize the Distribution of the LIKES as per the No. of WORDS used

# Distribution of LIKES per word-count band, 2010 excluded.
# The view is zoomed to the 95th percentile of LIKES with coord_cartesian()
# instead of ylim(): ylim() removes the rows above the cut-off before the
# medians/quartiles are computed (the old "Removed 54 rows" warning), so the
# boxes no longer describe the full data. coord_cartesian() only crops the view.
ggplot(subset(FB, YEAR != 2010), aes(x = Words_Level, y = LIKES)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, quantile(FB$LIKES, 0.95, names = FALSE)))

Notes: Apart from the “Very LOW” category, all word categories appear to have about the same median Likes, with the exception of the VERY HIGH category (100+ words), which has the highest median of all.

# LIKES per word-count band, one panel per year (2010 excluded).
# coord_cartesian() crops the view at the 95th percentile of LIKES without
# dropping rows; ylim() removed those rows before the box statistics were
# computed (the old "Removed 54 rows" warning), distorting the medians.
ggplot(subset(FB, YEAR != 2010), aes(x = Words_Level, y = LIKES)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, quantile(FB$LIKES, 0.95, names = FALSE))) +
  facet_wrap(~YEAR, scales = "free")

Notes: 2012 and 2013 are the only years in which the median LIKES increase consistently with the number of words. 2014, 2015 and 2016 are the three years that register the highest median Likes for the LOW word category! In 2015-LOW, the median Likes reach their zenith, close to 50. 2017 is the only year when there is an consistency in the number of likes across all the WORD Categories!

Let’s explore this relationship further

# Number of posts in each word-count band (2010 excluded).
# geom_bar() counts rows per category by default; geom_histogram(stat = "count")
# drew the same bars but warned about ignored binning parameters.
ggplot(subset(FB, YEAR != 2010), aes(x = Words_Level)) +
  geom_bar() +
  xlab("Number of Words")

Faceting the Words Distribution across the Year

# Number of posts in each word-count band, one panel per year (2010 excluded).
# geom_bar() replaces geom_histogram(stat = "count"), which drew the same bars
# but warned about ignored binning parameters.
ggplot(subset(FB, YEAR != 2010), aes(x = Words_Level)) +
  geom_bar() +
  xlab("Number of Words") +
  facet_wrap(~YEAR, scales = "free")

Notes: For every year, the highest number of posts falls in the category of Very Low number of Words! For 2017, the distribution is very sparse across the categories, with the number of POSTS with very low words being Alarmingly High. That’s probably the reason there was an Increase in the Median Likes for year 2017! Less No of POSTS!

Let’s find out why there are less no of posts with MORE Words

# Number of posts per post TYPE, one panel per year.
# Posts from 2010 and posts of type "note" are excluded.
# geom_bar() replaces geom_histogram(stat = "count"), which drew the same bars
# but warned about ignored binning parameters.
ggplot(subset(FB, YEAR != 2010 & TYPE != "note"), aes(x = TYPE)) +
  geom_bar(colour = "black", fill = "#34A389") +
  facet_wrap(~YEAR, scales = "free") +
  xlab("Status Type")

Notes: Importantly, there has been a gradual increase in the number of VIDEO statuses shared over the years. VIDEO statuses first appeared in 2013 and increased steadily, reaching their zenith so far in 2017! The distribution across all other categories has been more or less the same throughout the years, except for 2015, which registered a serious dip in overall statuses, and 2013, which saw a significant increase in statuses, reaching a maximum of 120.

Let’s see the trend through Linear Graphs

# Yearly post counts drawn as one line per post TYPE
# (2010 and "note" posts excluded).
ggplot(subset(FB, subset = YEAR != 2010 & TYPE != "note"), aes(x = YEAR)) +
  geom_freqpoly(mapping = aes(colour = TYPE), stat = "count") +
  scale_x_continuous(breaks = 2011:2017) +
  scale_y_continuous(breaks = seq(0, 300, by = 50)) +
  ylab("POSTS")

Notes: As visible, there is a steep increase in the number of Videos being shared in year 2017

Analyzing LIKES vs STATUS TYPE

# LIKES against WORDS, coloured by post TYPE (2010 and "note" posts excluded).
# Reference lines: black = mean of LIKES, red = median of LIKES, both taken
# over every row of FB (not just the plotted subset).
ggplot(subset(FB, subset = YEAR != 2010 & TYPE != "note"),
       aes(x = WORDS, y = LIKES)) +
  geom_point(mapping = aes(colour = TYPE)) +
  geom_hline(yintercept = mean(FB$LIKES), colour = "black") +
  geom_hline(yintercept = median(FB$LIKES), colour = "red") +
  scale_x_continuous(limits = c(0, 150), breaks = seq(0, 150, by = 25)) +
  scale_y_continuous(limits = c(0, 125), breaks = seq(0, 125, by = 25))
## Warning: Removed 11 rows containing missing values (geom_point).

Notes: The PHOTO Status are way ahead in generating LIKES followed by STATUS and Likes for LINK and VIDEO.

Faceting across several Years

# LIKES against WORDS by post TYPE, one panel per year with free scales.
# Black / red lines mark the mean / median of LIKES over every row of FB.
ggplot(subset(FB, subset = YEAR != 2010 & TYPE != "note"),
       aes(x = WORDS, y = LIKES)) +
  geom_point(mapping = aes(colour = TYPE)) +
  geom_hline(yintercept = mean(FB$LIKES), colour = "black") +
  geom_hline(yintercept = median(FB$LIKES), colour = "red") +
  facet_wrap(facets = ~YEAR, scales = "free")

Notes: Distribution of Likes is same as that of Distribution of Comments

# Count of posts in each word-count band, reported separately for every year.
by(data = FB$Words_Level, INDICES = FB$YEAR, FUN = summary)
## FB$YEAR: 2010
##  Vlow   Low   Med  High Vhigh 
##     2     0     0     0     0 
## -------------------------------------------------------- 
## FB$YEAR: 2011
##  Vlow   Low   Med  High Vhigh 
##    10     2     0     2     0 
## -------------------------------------------------------- 
## FB$YEAR: 2012
##  Vlow   Low   Med  High Vhigh 
##    65     7     8    10     5 
## -------------------------------------------------------- 
## FB$YEAR: 2013
##  Vlow   Low   Med  High Vhigh 
##   135    49    25     6     2 
## -------------------------------------------------------- 
## FB$YEAR: 2014
##  Vlow   Low   Med  High Vhigh 
##    76    14    14    15     7 
## -------------------------------------------------------- 
## FB$YEAR: 2015
##  Vlow   Low   Med  High Vhigh 
##    32     5     5     1     7 
## -------------------------------------------------------- 
## FB$YEAR: 2016
##  Vlow   Low   Med  High Vhigh 
##   113    16     5     8     7 
## -------------------------------------------------------- 
## FB$YEAR: 2017
##  Vlow   Low   Med  High Vhigh 
##   391    10    11     6     4

Analyzing the relationship between Words and Comments

# Distribution of COMMENTS per word-count band, 2010 excluded.
# coord_cartesian() crops the view at the 98th percentile of COMMENTS without
# dropping rows; ylim() removed those rows before the box statistics were
# computed (the old "Removed 22 rows" warning), distorting the medians.
ggplot(subset(FB, YEAR != 2010), aes(x = Words_Level, y = COMMENTS)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, quantile(FB$COMMENTS, 0.98, names = FALSE)))

Notes: The median number of comments is much the same for every word category, except that when the number of words is Very High the median shoots up to 5, compared with about 2.5 otherwise!

# Summary statistics of COMMENTS, reported separately for each word-count band.
by(data = FB$COMMENTS, INDICES = FB$Words_Level, FUN = summary)
## FB$Words_Level: Vlow
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   1.067   1.000  70.000 
## -------------------------------------------------------- 
## FB$Words_Level: Low
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   3.000   4.214   6.500  29.000 
## -------------------------------------------------------- 
## FB$Words_Level: Med
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   4.000   6.044   8.000  46.000 
## -------------------------------------------------------- 
## FB$Words_Level: High
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   5.562   7.000  48.000 
## -------------------------------------------------------- 
## FB$Words_Level: Vhigh
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.750   5.000   7.812   9.500  38.000

Facet Wrapping the same with the YEAR

# COMMENTS per word-count band, one panel per year (2010 excluded).
# coord_cartesian() crops the view at the 98th percentile of COMMENTS without
# dropping rows; ylim() removed those rows before the box statistics were
# computed (the old "Removed 22 rows" warning), distorting the medians.
ggplot(subset(FB, YEAR != 2010), aes(x = Words_Level, y = COMMENTS)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, quantile(FB$COMMENTS, 0.98, names = FALSE))) +
  facet_wrap(~YEAR, scales = "free")

Notes: The range of number of comments is same for all the years as evident by the same Y-scale across all the years. 2014 is the only year when there is considerable number of median comments spread across all Word-Categories! The highest value of Median Comments happens in 2015 for the High category! Let’s see what’s the reason behind the nice Comment trends in year 2014!

# Number of posts in each word-count band, one panel per year (2010 excluded).
# geom_bar() replaces geom_histogram(stat = "count"), which drew the same bars
# but warned about ignored binning parameters.
ggplot(subset(FB, YEAR != 2010), aes(x = Words_Level)) +
  geom_bar(colour = "black", fill = "#565689") +
  xlab("Number of Words") +
  facet_wrap(~YEAR, scales = "free")

Let’s see the variation of COMMENTS with POST Type

# Store TYPE as plain character strings from here on.
FB[["TYPE"]] <- as.character(FB[["TYPE"]])

# COMMENTS against WORDS, coloured by post TYPE (2010 and "note" posts excluded).
# Black / red lines mark the mean / median of COMMENTS over every row of FB.
ggplot(subset(FB, subset = YEAR != 2010 & TYPE != "note"),
       aes(x = WORDS, y = COMMENTS)) +
  geom_point(mapping = aes(colour = TYPE)) +
  geom_hline(yintercept = mean(FB$COMMENTS), colour = "black") +
  geom_hline(yintercept = median(FB$COMMENTS), colour = "red") +
  scale_x_continuous(limits = c(0, 150), breaks = seq(0, 150, by = 25)) +
  scale_y_continuous(limits = c(0, 40), breaks = seq(0, 40, by = 10))
## Warning: Removed 6 rows containing missing values (geom_point).

Notes: It is visible that High number of Comments are mostly attributed to Posts of TYPE:STATUS. The POSTS of type LINK, VIDEO are not the ones to have got many Comments! Let’s see the above distribution across the Years

# COMMENTS against WORDS by post TYPE, one panel per year with free scales.
# Black / red lines mark the mean / median of COMMENTS over every row of FB.
ggplot(subset(FB, subset = YEAR != 2010 & TYPE != "note"),
       aes(x = WORDS, y = COMMENTS)) +
  geom_point(mapping = aes(colour = TYPE)) +
  geom_hline(yintercept = mean(FB$COMMENTS), colour = "black") +
  geom_hline(yintercept = median(FB$COMMENTS), colour = "red") +
  facet_wrap(facets = ~YEAR, scales = "free")

Notes: It shows that 2013, 2014 and 2015 were all about STATUSES, with POSTS varying in length from very few words to very many! For 2016 and 2017, it was the PHOTOS that received the majority of comments, even though the number of Videos was significantly higher than that of photos in 2017 and comparable in 2016.

Analyzing multiple variables simultaneously

# LIKES by MONTH for 2012 onwards ("note" posts excluded), one panel per year.
# Points are coloured by post TYPE and sized by WORDS; the line is the mean of
# LIKES for each month.
# coord_cartesian() crops the view at the 98th percentile of LIKES without
# dropping rows. With ylim() the high-like posts were removed before the
# monthly mean was computed (the old "Removed 21 rows ... (stat_summary)"
# warning), so the mean line understated the true monthly mean.
# `fun` replaces the deprecated `fun.y` argument (needs ggplot2 >= 3.3.0).
ggplot(subset(FB, YEAR != 2010 & YEAR != 2011 & TYPE != "note"),
       aes(x = MONTH, y = LIKES)) +
  geom_point(aes(colour = TYPE, size = WORDS)) +
  stat_summary(fun = mean, geom = "line") +
  scale_x_continuous(breaks = 1:12) +
  coord_cartesian(ylim = c(0, quantile(FB$LIKES, 0.98, names = FALSE))) +
  facet_wrap(~YEAR, scales = "free")

Notes: This plot provides several insights into the overall process! As observed earlier, 2013, 2014 and 2015 were dominated by STATUS posts, with the overall number of posts being low in 2015. With the passage of years, the number of Likes for posts with more words has increased; the trend continues from 2013 to 2016 and right into 2017. As for mean LIKES, they have been almost uniform across the months except in 2015 and 2016! Particularly in 2016, there was a steep dip in the month of May, which was the time when I quit my job to go for further studies. And as is evident, the LIKES thereafter have been marginally higher than before!